# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Read the crypto market CSV into a DataFrame, using the `coin_id` column as the row index
market_data_path = "Resources/crypto_market_data.csv"
df_market_data = pd.read_csv(market_data_path, index_col="coin_id")

# Preview up to the first 40 rows
df_market_data.head(n=40)
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 1.08388 | 7.60278 | 6.57509 | 7.67258 | -3.25185 | 83.51840 | 37.51761 |
| ethereum | 0.22392 | 10.38134 | 4.80849 | 0.13169 | -12.88890 | 186.77418 | 101.96023 |
| tether | -0.21173 | 0.04935 | 0.00640 | -0.04237 | 0.28037 | -0.00542 | 0.01954 |
| ripple | -0.37819 | -0.60926 | 2.24984 | 0.23455 | -17.55245 | 39.53888 | -16.60193 |
| bitcoin-cash | 2.90585 | 17.09717 | 14.75334 | 15.74903 | -13.71793 | 21.66042 | 14.49384 |
| binancecoin | 2.10423 | 12.85511 | 6.80688 | 0.05865 | 36.33486 | 155.61937 | 69.69195 |
| chainlink | -0.23935 | 20.69459 | 9.30098 | -11.21747 | -43.69522 | 403.22917 | 325.13186 |
| cardano | 0.00322 | 13.99302 | 5.55476 | 10.10553 | -22.84776 | 264.51418 | 156.09756 |
| litecoin | -0.06341 | 6.60221 | 7.28931 | 1.21662 | -17.23960 | 27.49919 | -12.66408 |
| bitcoin-cash-sv | 0.92530 | 3.29641 | -1.86656 | 2.88926 | -24.87434 | 7.42562 | 93.73082 |
| crypto-com-chain | 0.61209 | -5.67151 | -8.53948 | -17.44782 | -16.47600 | 226.70782 | 305.05908 |
| usd-coin | -0.17825 | -0.11871 | -0.00568 | -0.16584 | 0.04271 | -0.15691 | -0.19205 |
| eos | 0.14477 | -1.31177 | 1.13751 | -4.63398 | -30.16898 | 18.06111 | -17.56753 |
| monero | 0.42961 | 15.78515 | 18.41097 | 38.95974 | 41.72500 | 169.52147 | 141.04116 |
| tron | 0.07647 | 4.23886 | 1.40337 | -12.60389 | 5.52545 | 132.88436 | 59.23821 |
| tezos | -0.67316 | 8.95665 | 2.32062 | -14.12663 | -44.82248 | 43.42842 | 140.01279 |
| okb | -2.72700 | -4.55389 | -5.02662 | -10.43847 | -2.83120 | 39.95853 | 141.95791 |
| stellar | -1.00843 | 2.07149 | -1.08217 | -8.12933 | -30.80369 | 84.62157 | 13.80715 |
| cosmos | -0.95103 | 16.08534 | 5.51074 | 4.57813 | -7.20130 | 185.99786 | 82.43833 |
| cdai | 0.21169 | 0.05820 | 0.17076 | -2.18147 | 0.65726 | -0.39210 | -0.28783 |
| neo | 0.49302 | 2.44243 | -9.84803 | -21.95472 | 13.51879 | 158.64773 | 131.29655 |
| wrapped-bitcoin | 1.10231 | 7.40537 | 6.55668 | 7.37557 | -3.58772 | 83.90520 | 37.53424 |
| leo-token | -0.13192 | -1.34886 | -7.02859 | 3.07525 | -7.54455 | 16.40588 | 21.00263 |
| huobi-token | -0.40818 | 1.61798 | 0.45488 | -3.25488 | -3.40689 | 42.23704 | 24.57164 |
| nem | -0.84990 | -0.85140 | 2.64844 | -3.65382 | 82.86094 | 216.17761 | 200.71797 |
| binance-usd | -0.10642 | 0.04726 | 0.05902 | 0.01843 | 0.09383 | 0.09959 | 0.13928 |
| iota | 0.41996 | 6.06830 | -3.77714 | -4.15281 | -34.51894 | 95.02821 | -3.98533 |
| vechain | 1.28766 | -1.76352 | -18.15890 | -14.16831 | -43.62359 | 269.70264 | 202.86827 |
| zcash | -0.60897 | 10.34780 | 5.79179 | 7.37007 | -20.54216 | 122.54767 | 82.88499 |
| theta-token | -4.56089 | -6.09456 | -6.57354 | 31.43355 | 80.03112 | 882.65105 | 701.37599 |
| dash | -1.06006 | 5.09387 | 0.51708 | -7.79140 | -26.22460 | 6.23435 | -2.45897 |
| ethereum-classic | -0.45950 | 3.05209 | -1.26669 | 2.89572 | -25.90799 | 10.39203 | 11.11094 |
| ethlend | -13.52786 | 4.21266 | -9.80075 | -29.99499 | 2.13917 | 2227.92782 | 7852.08970 |
| maker | -0.60285 | 8.15400 | -1.48854 | 16.71360 | -18.61722 | 82.53544 | 10.65279 |
| havven | -4.07216 | 4.33651 | -13.29164 | -20.84154 | -39.42657 | 622.92465 | 678.78427 |
| omisego | 4.84033 | 6.82985 | -13.21636 | -9.11552 | 79.27505 | 585.26307 | 320.69054 |
| celsius-degree-token | 2.51323 | 0.60354 | 24.23919 | 140.79570 | 223.06437 | 1590.19149 | 2009.72217 |
| ontology | -1.35845 | -1.21399 | -10.56222 | -34.70548 | -32.70004 | 54.76717 | -12.65493 |
| ftx-token | 0.83416 | 7.10438 | -0.20629 | -10.56394 | 5.90295 | 57.48950 | 168.37251 |
| true-usd | -0.06197 | 0.16642 | 0.10974 | 0.03090 | 0.25154 | -0.08874 | 0.40617 |
# Generate summary statistics for every column of the loaded market data
# (the output of pandas `describe()` is displayed as the cell result)
df_market_data.describe()
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | -0.269686 | 4.497147 | 0.185787 | 1.545693 | -0.094119 | 236.537432 | 347.667956 |
| std | 2.694793 | 6.375218 | 8.376939 | 26.344218 | 47.365803 | 435.225304 | 1247.842884 |
| min | -13.527860 | -6.094560 | -18.158900 | -34.705480 | -44.822480 | -0.392100 | -17.567530 |
| 25% | -0.608970 | 0.047260 | -5.026620 | -10.438470 | -25.907990 | 21.660420 | 0.406170 |
| 50% | -0.063410 | 3.296410 | 0.109740 | -0.042370 | -7.544550 | 83.905200 | 69.691950 |
| 75% | 0.612090 | 7.602780 | 5.510740 | 4.578130 | 0.657260 | 216.177610 | 168.372510 |
| max | 4.840330 | 20.694590 | 24.239190 | 140.795700 | 223.064370 | 2227.927820 | 7852.089700 |
# Plot your data to see what's in your DataFrame:
# an hvPlot line chart of `df_market_data`, 800 wide by 400 high,
# with rot=90 (hvPlot's tick-label rotation option)
df_market_data.hvplot.line(
width=800,
height=400,
rot=90
)
# Columns holding the price-change percentages that are scaled and later clustered.
# Listed once so the selection and the rebuilt DataFrame cannot drift apart.
price_change_columns = [
    "price_change_percentage_24h",
    "price_change_percentage_7d",
    "price_change_percentage_14d",
    "price_change_percentage_30d",
    "price_change_percentage_60d",
    "price_change_percentage_200d",
    "price_change_percentage_1y",
]

# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file.
# The scaler's output is kept under its own name; the row/column labels are
# re-attached when the DataFrame is built below.
scaled_values = StandardScaler().fit_transform(df_market_data[price_change_columns])

# Copy the crypto names from the original data
crypto_name_index = df_market_data.index

# Create a DataFrame with the scaled data, with the coin_id index set at construction
df_market_data_scaled = pd.DataFrame(
    scaled_values,
    columns=price_change_columns,
    index=crypto_name_index,
)

# Display the scaled data
df_market_data_scaled
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | |
|---|---|---|---|---|---|---|---|
| coin_id | |||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 |
| binancecoin | 0.891871 | 1.327295 | 0.800214 | -0.057148 | 0.778653 | -0.188232 | -0.225533 |
| chainlink | 0.011397 | 2.572251 | 1.101647 | -0.490495 | -0.931954 | 0.387759 | -0.018284 |
| cardano | 0.102530 | 1.508001 | 0.648885 | 0.328959 | -0.486349 | 0.065080 | -0.155428 |
| litecoin | 0.077497 | 0.334297 | 0.858520 | -0.012646 | -0.366477 | -0.486266 | -0.292351 |
| bitcoin-cash-sv | 0.448952 | -0.190684 | -0.248043 | 0.051634 | -0.529666 | -0.532961 | -0.206029 |
| crypto-com-chain | 0.331280 | -1.614844 | -1.054521 | -0.729931 | -0.350155 | -0.022866 | -0.034570 |
| usd-coin | 0.034352 | -0.733026 | -0.023140 | -0.065775 | 0.002925 | -0.550599 | -0.282232 |
| eos | 0.155710 | -0.922491 | 0.115024 | -0.237488 | -0.642837 | -0.508220 | -0.296330 |
| monero | 0.262723 | 1.792602 | 2.202665 | 1.437842 | 0.893865 | -0.155893 | -0.167644 |
| tron | 0.130050 | -0.041018 | 0.147155 | -0.543776 | 0.120116 | -0.241118 | -0.234014 |
| tezos | -0.151583 | 0.708196 | 0.258012 | -0.602296 | -0.956049 | -0.449211 | -0.168479 |
| okb | -0.923203 | -1.437359 | -0.629963 | -0.460558 | -0.058504 | -0.457283 | -0.166900 |
| stellar | -0.277543 | -0.385209 | -0.153243 | -0.371816 | -0.656403 | -0.353387 | -0.270874 |
| cosmos | -0.255978 | 1.840274 | 0.643565 | 0.116538 | -0.151913 | -0.117565 | -0.215191 |
| cdai | 0.180851 | -0.704931 | -0.001816 | -0.143237 | 0.016060 | -0.551146 | -0.282310 |
| neo | 0.286546 | -0.326301 | -1.212670 | -0.903134 | 0.290970 | -0.181187 | -0.175550 |
| wrapped-bitcoin | 0.515453 | 0.461843 | 0.769975 | 0.224045 | -0.074674 | -0.355054 | -0.251623 |
| leo-token | 0.051758 | -0.928381 | -0.871918 | 0.058782 | -0.159250 | -0.512071 | -0.265036 |
| huobi-token | -0.052032 | -0.457229 | 0.032522 | -0.184489 | -0.070809 | -0.451982 | -0.262140 |
| nem | -0.217984 | -0.849381 | 0.297632 | -0.199820 | 1.773127 | -0.047361 | -0.119226 |
| binance-usd | 0.061339 | -0.706669 | -0.015321 | -0.058694 | 0.004017 | -0.550003 | -0.281963 |
| iota | 0.259097 | 0.249508 | -0.478953 | -0.218997 | -0.735815 | -0.329179 | -0.285310 |
| vechain | 0.585089 | -0.994231 | -2.217108 | -0.603898 | -0.930423 | 0.077149 | -0.117482 |
| zcash | -0.127467 | 0.929119 | 0.677532 | 0.223834 | -0.437068 | -0.265163 | -0.214829 |
| theta-token | -1.612188 | -1.682027 | -0.816921 | 1.148607 | 1.712641 | 1.502992 | 0.286977 |
| dash | -0.296940 | 0.094763 | 0.040040 | -0.358830 | -0.558527 | -0.535732 | -0.284071 |
| ethereum-classic | -0.071312 | -0.229484 | -0.175544 | 0.051882 | -0.551760 | -0.526060 | -0.273062 |
| ethlend | -4.981042 | -0.045178 | -1.206956 | -1.212126 | 0.047736 | 4.632380 | 6.088625 |
| maker | -0.125168 | 0.580730 | -0.202356 | 0.582911 | -0.395923 | -0.358240 | -0.273433 |
| havven | -1.428574 | -0.025510 | -1.628859 | -0.860354 | -0.840714 | 0.898815 | 0.268647 |
| omisego | 1.919812 | 0.370447 | -1.619761 | -0.409716 | 1.696480 | 0.811207 | -0.021888 |
| celsius-degree-token | 1.045530 | -0.618328 | 2.907054 | 5.351455 | 4.769913 | 3.148875 | 1.348488 |
| ontology | -0.409044 | -0.906963 | -1.298986 | -1.393153 | -0.696937 | -0.422835 | -0.292344 |
| ftx-token | 0.414711 | 0.414044 | -0.047386 | -0.465380 | 0.128185 | -0.416502 | -0.145469 |
| true-usd | 0.078038 | -0.687745 | -0.009191 | -0.058214 | 0.007388 | -0.550441 | -0.281747 |
| digibyte | 1.217453 | -0.607714 | -0.907066 | 0.449939 | -0.662530 | 0.572367 | -0.132482 |
# Show the coin_id index that was copied from the original market data
crypto_name_index
Index(['bitcoin', 'ethereum', 'tether', 'ripple', 'bitcoin-cash',
'binancecoin', 'chainlink', 'cardano', 'litecoin', 'bitcoin-cash-sv',
'crypto-com-chain', 'usd-coin', 'eos', 'monero', 'tron', 'tezos', 'okb',
'stellar', 'cosmos', 'cdai', 'neo', 'wrapped-bitcoin', 'leo-token',
'huobi-token', 'nem', 'binance-usd', 'iota', 'vechain', 'zcash',
'theta-token', 'dash', 'ethereum-classic', 'ethlend', 'maker', 'havven',
'omisego', 'celsius-degree-token', 'ontology', 'ftx-token', 'true-usd',
'digibyte'],
dtype='object', name='coin_id')
# Candidate cluster counts for the elbow analysis: k = 1 through 10
# (the stop value 11 passed to `range` is exclusive)
k = [*range(1, 11)]
k
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Create an empty list to store the inertia values
inertia = []

# Compute the inertia for each candidate value of k on the scaled data.
# random_state is fixed so repeated runs give the same inertia values.
for i in k:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    k_model = KMeans(n_clusters=i, random_state=1)
    # 2. Fit the model to the data using `df_market_data_scaled`
    k_model.fit(df_market_data_scaled)
    # 3. Append the model.inertia_ to the inertia list
    inertia.append(k_model.inertia_)
C:\Users\samit\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# Create a dictionary pairing each k with its inertia (data for the Elbow curve)
elbow_data = dict(K=k, Inertia=inertia)

# Create a DataFrame with the data to plot the Elbow curve
df_elbow_scaled = pd.DataFrame(data=elbow_data)

# Preview the first five rows
df_elbow_scaled.head(5)
| K | Inertia | |
|---|---|---|
| 0 | 1 | 287.000000 |
| 1 | 2 | 195.820218 |
| 2 | 3 | 123.190482 |
| 3 | 4 | 79.022435 |
| 4 | 5 | 65.405923 |
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The plot is built once and stored in `elbow_plot_orginal`, which the
# composite Elbow-curve comparison reuses.
elbow_plot_orginal = df_elbow_scaled.hvplot.line(
    x="K",
    y="Inertia",
    title="Elbow Curve",
    xticks=k,
)

# Display the stored plot instead of constructing an identical second one
elbow_plot_orginal
Question: What is the best value for k?
Answer: The best value for k is 4. That is the elbow of the curve: beyond it, each additional cluster yields only a small further reduction in inertia.
# Initialize the K-Means model using the best value for k (4 clusters);
# random_state=1 is the same seed used in the elbow loop
model = KMeans(n_clusters= 4,random_state=1)
# Fit the K-Means model using the scaled data
model.fit(df_market_data_scaled)
KMeans(n_clusters=4, random_state=1)
# Predict the clusters to group the cryptocurrencies using the scaled data
# (the same DataFrame the model was fitted on)
k_lower = model.predict(df_market_data_scaled)
# Print the resulting array of cluster values.
k_lower
array([3, 3, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3,
1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 3, 1, 1, 2, 1, 1, 1, 1])
# Build a labelled copy of the scaled data: `assign` returns a new DataFrame
# with the predicted clusters in `cluster_lower`, leaving `df_market_data_scaled` unchanged
df_market_data_scaled_prediction = df_market_data_scaled.assign(cluster_lower=k_lower)

# Display sample data
df_market_data_scaled_prediction.head(5)
| price_change_percentage_24h | price_change_percentage_7d | price_change_percentage_14d | price_change_percentage_30d | price_change_percentage_60d | price_change_percentage_200d | price_change_percentage_1y | cluster_lower | |
|---|---|---|---|---|---|---|---|---|
| coin_id | ||||||||
| bitcoin | 0.508529 | 0.493193 | 0.772200 | 0.235460 | -0.067495 | -0.355953 | -0.251637 | 3 |
| ethereum | 0.185446 | 0.934445 | 0.558692 | -0.054341 | -0.273483 | -0.115759 | -0.199352 | 3 |
| tether | 0.021774 | -0.706337 | -0.021680 | -0.061030 | 0.008005 | -0.550247 | -0.282061 | 1 |
| ripple | -0.040764 | -0.810928 | 0.249458 | -0.050388 | -0.373164 | -0.458259 | -0.295546 | 1 |
| bitcoin-cash | 1.193036 | 2.000959 | 1.760610 | 0.545842 | -0.291203 | -0.499848 | -0.270317 | 3 |
# Create a scatter plot using hvPlot by setting
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
# The plot is built once and stored in `cluster_plot_original`, which the
# composite cluster comparison reuses.
cluster_plot_original = df_market_data_scaled_prediction.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster_lower",
    hover_cols="coin_id",
).opts(yformatter="%.0f")

# Display the stored plot instead of constructing an identical second one
cluster_plot_original
# Create a PCA model instance and set `n_components=3`.
pca = PCA(n_components=3)
# Use the PCA model with `fit_transform` to reduce to
# three principal components.
market_data_pca = pca.fit_transform(df_market_data_scaled)
# View the first five rows of the transformed data
# (this is the raw `fit_transform` result, not yet wrapped in a DataFrame).
market_data_pca[:5]
array([[-0.60066733, 0.84276006, 0.46159457],
[-0.45826071, 0.45846566, 0.95287678],
[-0.43306981, -0.16812638, -0.64175193],
[-0.47183495, -0.22266008, -0.47905316],
[-1.15779997, 2.04120919, 1.85971527]])
# Retrieve the explained variance to determine how much information
# can be attributed to each principal component
# (one ratio per fitted component; the model was created with n_components=3).
pca.explained_variance_ratio_
array([0.3719856 , 0.34700813, 0.17603793])
Question: What is the total explained variance of the three principal components?
Answer: About 89.5% of the total variance (0.372 + 0.347 + 0.176 ≈ 0.895) is captured by the three principal components.
# Create a new DataFrame with the PCA data: one column per principal component,
# indexed by the crypto names (coin_id) copied from the original data
pca_column_names = ["PCA1", "PCA2", "PCA3"]
df_market_data_pca = pd.DataFrame(
    data=market_data_pca,
    index=crypto_name_index,
    columns=pca_column_names,
)

# Display sample data
df_market_data_pca
| PCA1 | PCA2 | PCA3 | |
|---|---|---|---|
| coin_id | |||
| bitcoin | -0.600667 | 0.842760 | 0.461595 |
| ethereum | -0.458261 | 0.458466 | 0.952877 |
| tether | -0.433070 | -0.168126 | -0.641752 |
| ripple | -0.471835 | -0.222660 | -0.479053 |
| bitcoin-cash | -1.157800 | 2.041209 | 1.859715 |
| binancecoin | -0.516534 | 1.388377 | 0.804071 |
| chainlink | -0.450711 | 0.517699 | 2.846143 |
| cardano | -0.345600 | 0.729439 | 1.478013 |
| litecoin | -0.649468 | 0.432165 | 0.600303 |
| bitcoin-cash-sv | -0.759014 | -0.201200 | -0.217653 |
| crypto-com-chain | -0.248198 | -1.376252 | -1.462026 |
| usd-coin | -0.438408 | -0.175337 | -0.663388 |
| eos | -0.693425 | -0.473815 | -0.527597 |
| monero | 0.060499 | 2.909404 | 1.498571 |
| tron | -0.393352 | -0.108192 | -0.012756 |
| tezos | -0.796176 | -0.494409 | 1.082812 |
| okb | 0.064075 | -1.269825 | -1.098829 |
| stellar | -0.489015 | -0.732719 | -0.062543 |
| cosmos | -0.306272 | 0.703415 | 1.714224 |
| cdai | -0.513528 | -0.142802 | -0.656566 |
| neo | -0.362120 | -0.986914 | -0.728752 |
| wrapped-bitcoin | -0.604265 | 0.827398 | 0.439316 |
| leo-token | -0.413296 | -0.674115 | -1.076628 |
| huobi-token | -0.407483 | -0.212507 | -0.351426 |
| nem | 0.608974 | 0.563532 | -1.148742 |
| binance-usd | -0.450211 | -0.151019 | -0.647401 |
| iota | -0.764665 | -0.517886 | 0.204990 |
| vechain | -0.556315 | -1.938209 | -1.261776 |
| zcash | -0.425147 | 0.492976 | 1.058048 |
| theta-token | 2.676868 | -0.013954 | -1.965207 |
| dash | -0.613923 | -0.479337 | 0.339565 |
| ethereum-classic | -0.579924 | -0.356334 | -0.114942 |
| ethlend | 8.089018 | -3.896891 | 2.301382 |
| maker | -0.389045 | 0.165041 | 0.379414 |
| havven | 0.865762 | -2.261882 | 0.275583 |
| omisego | 0.111675 | 0.428316 | -1.205398 |
| celsius-degree-token | 4.792395 | 6.767679 | -1.986985 |
| ontology | -0.632355 | -2.108117 | -0.652227 |
| ftx-token | -0.593142 | 0.021485 | 0.209911 |
| true-usd | -0.458131 | -0.135734 | -0.635284 |
| digibyte | -0.297910 | -0.191126 | -0.909602 |
# Create a list with the k-values 1 through 10
# (the stop value 11 passed to `range` is exclusive)
k = list(range(1, 11))

# Create an empty list to store the inertia values
inertia = []

# Compute the inertia for each candidate value of k on the PCA data.
# random_state is fixed so repeated runs give the same inertia values.
for i in k:
    # 1. Create a KMeans model using the loop counter for the n_clusters
    k_model = KMeans(n_clusters=i, random_state=1)
    # 2. Fit the model to the data using `df_market_data_pca`
    k_model.fit(df_market_data_pca)
    # 3. Append the model.inertia_ to the inertia list
    inertia.append(k_model.inertia_)
C:\Users\samit\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. warnings.warn(
# Create a dictionary pairing each k with its inertia (data for the Elbow curve)
elbow_data = dict(k=k, inertia=inertia)

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first five rows
df_elbow.head(5)
| k | inertia | |
|---|---|---|
| 0 | 1 | 256.874086 |
| 1 | 2 | 165.901994 |
| 2 | 3 | 93.774626 |
| 3 | 4 | 49.665497 |
| 4 | 5 | 38.352251 |
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The plot is built once and stored in `elbow_pca`, which the composite
# Elbow-curve comparison reuses.
elbow_pca = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

# Display the stored plot instead of constructing an identical second one
elbow_pca
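Question: What is the best value for k when using the PCA data?
Answer: The best value for k is 4. The inertia falls steeply up to k = 4 (from about 256.9 at k = 1 to about 49.7 at k = 4) and only slightly afterwards (about 38.4 at k = 5).
Question: Does it differ from the best k value found using the original data?
Answer: No. The elbow is at k = 4 for both the original scaled data and the PCA data.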
# Initialize the K-Means model using the best value for k.
# `model` is rebound to a fresh estimator so the PCA fit does not reuse
# the estimator object that was already fitted on the scaled data.
model = KMeans(n_clusters=4, random_state=1)
# Fit the K-Means model using the PCA data
model.fit(df_market_data_pca)
KMeans(n_clusters=4, random_state=1)
# Predict the clusters to group the cryptocurrencies using the PCA data
# NOTE(review): despite the name `k_3`, the model here uses n_clusters=4;
# the name is left as-is because the following cell reads it.
k_3 = model.predict(df_market_data_pca)
# Print the resulting array of cluster values.
k_3
array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 3, 0, 0, 0, 0])
# Build a labelled copy of the PCA data: `assign` returns a new DataFrame
# with the predicted clusters in `market_segment`, leaving `df_market_data_pca` unchanged
df_market_data_pca_predict = df_market_data_pca.assign(market_segment=k_3)

# Display sample data
df_market_data_pca_predict
| PCA1 | PCA2 | PCA3 | market_segment | |
|---|---|---|---|---|
| coin_id | ||||
| bitcoin | -0.600667 | 0.842760 | 0.461595 | 1 |
| ethereum | -0.458261 | 0.458466 | 0.952877 | 1 |
| tether | -0.433070 | -0.168126 | -0.641752 | 0 |
| ripple | -0.471835 | -0.222660 | -0.479053 | 0 |
| bitcoin-cash | -1.157800 | 2.041209 | 1.859715 | 1 |
| binancecoin | -0.516534 | 1.388377 | 0.804071 | 1 |
| chainlink | -0.450711 | 0.517699 | 2.846143 | 1 |
| cardano | -0.345600 | 0.729439 | 1.478013 | 1 |
| litecoin | -0.649468 | 0.432165 | 0.600303 | 1 |
| bitcoin-cash-sv | -0.759014 | -0.201200 | -0.217653 | 0 |
| crypto-com-chain | -0.248198 | -1.376252 | -1.462026 | 0 |
| usd-coin | -0.438408 | -0.175337 | -0.663388 | 0 |
| eos | -0.693425 | -0.473815 | -0.527597 | 0 |
| monero | 0.060499 | 2.909404 | 1.498571 | 1 |
| tron | -0.393352 | -0.108192 | -0.012756 | 0 |
| tezos | -0.796176 | -0.494409 | 1.082812 | 1 |
| okb | 0.064075 | -1.269825 | -1.098829 | 0 |
| stellar | -0.489015 | -0.732719 | -0.062543 | 0 |
| cosmos | -0.306272 | 0.703415 | 1.714224 | 1 |
| cdai | -0.513528 | -0.142802 | -0.656566 | 0 |
| neo | -0.362120 | -0.986914 | -0.728752 | 0 |
| wrapped-bitcoin | -0.604265 | 0.827398 | 0.439316 | 1 |
| leo-token | -0.413296 | -0.674115 | -1.076628 | 0 |
| huobi-token | -0.407483 | -0.212507 | -0.351426 | 0 |
| nem | 0.608974 | 0.563532 | -1.148742 | 0 |
| binance-usd | -0.450211 | -0.151019 | -0.647401 | 0 |
| iota | -0.764665 | -0.517886 | 0.204990 | 0 |
| vechain | -0.556315 | -1.938209 | -1.261776 | 0 |
| zcash | -0.425147 | 0.492976 | 1.058048 | 1 |
| theta-token | 2.676868 | -0.013954 | -1.965207 | 0 |
| dash | -0.613923 | -0.479337 | 0.339565 | 0 |
| ethereum-classic | -0.579924 | -0.356334 | -0.114942 | 0 |
| ethlend | 8.089018 | -3.896891 | 2.301382 | 2 |
| maker | -0.389045 | 0.165041 | 0.379414 | 1 |
| havven | 0.865762 | -2.261882 | 0.275583 | 0 |
| omisego | 0.111675 | 0.428316 | -1.205398 | 0 |
| celsius-degree-token | 4.792395 | 6.767679 | -1.986985 | 3 |
| ontology | -0.632355 | -2.108117 | -0.652227 | 0 |
| ftx-token | -0.593142 | 0.021485 | 0.209911 | 0 |
| true-usd | -0.458131 | -0.135734 | -0.635284 | 0 |
| digibyte | -0.297910 | -0.191126 | -0.909602 | 0 |
# Create a scatter plot using hvPlot by setting
# `x="PCA1"` and `y="PCA2"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
# The plot is built once and stored in `cluster_pca`, which the composite
# cluster comparison reuses.
cluster_pca = df_market_data_pca_predict.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="market_segment",
    hover_cols="coin_id",  # same hover column as the scatter plot of the scaled data
)

# Display the stored plot instead of constructing an identical second one
cluster_pca
In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.
# Composite plot to contrast the Elbow curves:
# `+` combines the PCA elbow plot and the original-data elbow plot into one layout.
# Both plots carry the same title ("Elbow Curve"); the PCA one is the first operand.
composit_elbow = elbow_pca + elbow_plot_orginal
composit_elbow
# Composite plot to contrast the clusters:
# `+` combines the PCA cluster scatter and the original-data cluster scatter into one layout,
# with the PCA plot as the first operand.
composit_cluster = cluster_pca + cluster_plot_original
composit_cluster
Question: After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?
Answer: The elbow curves point to the same number of clusters (k = 4) with and without PCA. In the cluster plots, however, the clusters are more clearly distinguishable when fewer features (the three principal components) are used to cluster the data (compare the original and PCA cluster graphs).